import os
from datasets import Dataset, load_dataset, concatenate_datasets
from tqdm import tqdm
from verl.utils.hdfs_io import copy, makedirs
import argparse
import json
import pickle
from openai import OpenAI
import random 
import time

def Evaluate(prompt: str, model_name, max_retries=30, max_delay=60.0):
    """Send ``prompt`` to a chat-completion model and return the reply text.

    The request uses a fixed system message and ``temperature=0``. A failed
    attempt is retried after an exponential delay that is capped at
    ``max_delay`` seconds, plus up to one second of random jitter.

    Args:
        prompt: Content of the user message.
        model_name: Model identifier passed to the chat-completions call.
        max_retries: Total number of attempts before giving up; must be >= 1.
        max_delay: Upper bound, in seconds, on the exponential part of the
            wait between two attempts.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: Whatever the last failed attempt raised.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and None is returned.
        raise ValueError("max_retries must be at least 1")

    # The client is built once and reused by every attempt.
    # NOTE(review): api_key and base_url are blank in this file — presumably
    # filled in before running; confirm.
    client = OpenAI(
        api_key="",
        base_url="",
    )

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {'role': 'system', 'content': 'You are a helpful assistant.'},
                    {'role': 'user', 'content': prompt}
                ],
                temperature=0
            )
            output = completion.choices[0].message.content
        except Exception as exc:
            if attempt == max_retries - 1:
                raise  # out of attempts: surface the last error to the caller
            # Uncapped, 2 ** attempt grows to years of sleep with the default
            # max_retries, so the exponential part is clamped.
            sleep_time = min(2 ** attempt, max_delay) + random.random()
            print(
                f"Evaluate attempt {attempt + 1}/{max_retries} failed "
                f"({exc!r}); retrying in {sleep_time:.1f}s"
            )
            time.sleep(sleep_time)
        else:
            print(output)
            return output


def _load_rows(builder, path, rows):
    """Load the ``rows`` split expression of the file at ``path`` with ``builder``."""
    return load_dataset(builder, data_files=path, split=rows)


# --- ToMi -----------------------------------------------------------------
dataset_test_tomi = _load_rows("json", ".../data/test_balanced.json", "train[800:1000]")
print(dataset_test_tomi)

# --- Hi-ToM ---------------------------------------------------------------
# Two evaluation sets are assembled, each from six slices of the same
# dataset. NOTE(review): "third"/"fourth" presumably refer to the question
# order covered by those rows — confirm against the dataset layout.
_HITOM_SOURCE = ".../ToM_data/Hi-ToM"

(
    dataset_test_hitom_1,
    dataset_test_hitom_2,
    dataset_test_hitom_3,
    dataset_test_hitom_4,
    dataset_test_hitom_5,
    dataset_test_hitom_6,
) = [
    load_dataset(_HITOM_SOURCE, split=rows)
    for rows in (
        "train[60:80]",
        "train[160:180]",
        "train[260:280]",
        "train[660:680]",
        "train[760:780]",
        "train[860:880]",
    )
]

dataset_test_hitom_third = concatenate_datasets([
    dataset_test_hitom_1,
    dataset_test_hitom_2,
    dataset_test_hitom_3,
    dataset_test_hitom_4,
    dataset_test_hitom_5,
    dataset_test_hitom_6,
])
print(dataset_test_hitom_third)

(
    dataset_test_hitom_7,
    dataset_test_hitom_8,
    dataset_test_hitom_9,
    dataset_test_hitom_10,
    dataset_test_hitom_11,
    dataset_test_hitom_12,
) = [
    load_dataset(_HITOM_SOURCE, split=rows)
    for rows in (
        "train[80:100]",
        "train[180:200]",
        "train[280:300]",
        "train[680:700]",
        "train[780:800]",
        "train[880:900]",
    )
]

dataset_test_hitom_fourth = concatenate_datasets([
    dataset_test_hitom_7,
    dataset_test_hitom_8,
    dataset_test_hitom_9,
    dataset_test_hitom_10,
    dataset_test_hitom_11,
    dataset_test_hitom_12,
])
print(dataset_test_hitom_fourth)

# --- ExploreToM (CSV source) ----------------------------------------------
dataset_test_exploretom = _load_rows(
    "csv", ".../ToM_data/ExploreToM/ExploreToM-data-sample.csv", "train[2000:2300]"
)
print(dataset_test_exploretom)

# --- ToMbench ---------------------------------------------------------------
dataset_test_ToMbench = _load_rows("json", ".../ToMbench_data/test_combined.json", "train[:431]")
print(dataset_test_ToMbench)

# --- SocialIQa --------------------------------------------------------------
dataset_test_socialqa = _load_rows(
    "json", ".../SocialIqa/socialIWa_v1.4_tst_wDims.json", "train[:120]"
)
print(dataset_test_socialqa)

# --- SimpleToM: one file per question type ----------------------------------
dataset_test_simpletom_mental = _load_rows(
    "json", ".../SimpleToM/mental-state-qa/test.json", "train[:120]"
)
print(dataset_test_simpletom_mental)

dataset_test_simpletom_behavior = _load_rows(
    "json", ".../SimpleToM/behavior-qa/test.json", "train[:120]"
)
print(dataset_test_simpletom_behavior)

dataset_test_simpletom_judgment = _load_rows(
    "json", ".../SimpleToM/judgment-qa/test.json", "train[:120]"
)
print(dataset_test_simpletom_judgment)

# --- ToMATO -----------------------------------------------------------------
dataset_test_tomato_first = _load_rows(
    "json", ".../ToMATO/dataset/tomato_first.json", "train[:25]"
)
print(dataset_test_tomato_first)

dataset_test_tomato_second = _load_rows(
    "json", ".../ToMATO/dataset/tomato_second.json", "train[:25]"
)
print(dataset_test_tomato_second)

# --- OpenToM ----------------------------------------------------------------
# These files are loaded without a split expression, so the full 'train'
# split is taken from the returned mapping.
dataset_test_opentom_attitude = load_dataset(
    "json", data_files=".../OpenToM/merged_attitude_data.json"
)['train']
print(dataset_test_opentom_attitude)

dataset_test_opentom_location_cg_fo = load_dataset(
    "json", data_files=".../OpenToM/merged_location_cg_fo_data.json"
)['train']
print(dataset_test_opentom_location_cg_fo)

dataset_test_opentom_location_cg_so = load_dataset(
    "json", data_files=".../OpenToM/merged_location_cg_so_data.json"
)['train']
print(dataset_test_opentom_location_cg_so)


def _field_values(rows, field):
    """Return ``row[field]`` for every row of ``rows``, in iteration order."""
    return [row[field] for row in rows]


def _lettered_answer(row, letter_field, lettered_fields, fallback_field):
    """Return the option text selected by the letter stored in ``row``.

    ``lettered_fields`` is a sequence of ``(letter, field)`` pairs tried in
    order; when no letter matches, ``row[fallback_field]`` is returned.
    """
    letter = row[letter_field]
    for candidate, field in lettered_fields:
        if letter == candidate:
            return row[field]
    return row[fallback_field]


def _simpletom_answer(row):
    """Return the first choice text for answer key 'A', else the second."""
    choices = row["choices"]
    position = 0 if row["answerKey"] == 'A' else 1
    return choices["text"][position]


# Gold answers of every evaluation set, appended in one fixed order.
answer_list = []

# Sets whose gold answer is stored directly in a single field.
answer_list.extend(_field_values(dataset_test_tomi, "answer"))
answer_list.extend(_field_values(dataset_test_hitom_third, "answer"))
answer_list.extend(_field_values(dataset_test_hitom_fourth, "answer"))
answer_list.extend(_field_values(dataset_test_exploretom, "expected_answer"))

# ToMbench stores a letter; anything other than A/B/C selects OPTION-D.
answer_list.extend(
    _lettered_answer(
        row,
        "答案\nANSWER",
        (('A', "OPTION-A"), ('B', "OPTION-B"), ('C', "OPTION-C")),
        "OPTION-D",
    )
    for row in dataset_test_ToMbench
)

# SocialIQa stores a letter; anything other than A/B selects answerC.
answer_list.extend(
    _lettered_answer(
        row,
        "label_letter",
        (("A", "answerA"), ("B", "answerB")),
        "answerC",
    )
    for row in dataset_test_socialqa
)

# SimpleToM: the three question types share the same two-choice layout.
for simpletom_rows in (
    dataset_test_simpletom_mental,
    dataset_test_simpletom_behavior,
    dataset_test_simpletom_judgment,
):
    answer_list.extend(_simpletom_answer(row) for row in simpletom_rows)

answer_list.extend(_field_values(dataset_test_tomato_first, "a_str"))
answer_list.extend(_field_values(dataset_test_tomato_second, "a_str"))

answer_list.extend(_field_values(dataset_test_opentom_attitude, "answer"))
answer_list.extend(_field_values(dataset_test_opentom_location_cg_fo, "answer"))
answer_list.extend(_field_values(dataset_test_opentom_location_cg_so, "answer"))

print(answer_list)

print(len(answer_list))

def str_to_bool(s):
    """Interpret a judge reply as a boolean verdict.

    Surrounding whitespace, quotes, backticks and periods are ignored, so
    replies such as ``'True'`` or ``True.`` count as true. A missing reply
    (``None``) and anything that is not the word "true" count as false.
    """
    if s is None:
        return False
    return s.strip().strip("'\"`.").strip().lower() == 'true'


# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load generation files produced by a trusted run.
# NOTE(review): the path is blank in this file — presumably filled in before
# running; confirm.
with open('', 'rb') as f:
    long_thought_list = pickle.load(f)

# Fail before any judge call is made instead of hitting an IndexError midway.
if len(long_thought_list) < len(answer_list):
    raise ValueError(
        f"Got {len(long_thought_list)} generations for "
        f"{len(answer_list)} reference answers"
    )
if len(long_thought_list) > len(answer_list):
    print(
        f"Warning: {len(long_thought_list) - len(answer_list)} trailing "
        f"generations have no reference answer and are ignored"
    )

# One boolean verdict per reference answer, in answer_list order.
true_list = []

# zip stops at the end of answer_list, so surplus generations are skipped.
for model_generation, answer in zip(long_thought_list, answer_list):
    prompt = f"""\
This is someone's response [{model_generation}]:


This is the correct answer:

[{answer}]

Is final answer correct? Output 'True' or 'False' only.
"""

    graded_answer = Evaluate(prompt, 'deepseek-v3')
    print(graded_answer)
    true_list.append(str_to_bool(graded_answer))

# Evaluation sets in the exact order their answers were appended to
# answer_list, so the running total of their sizes gives each set's slice of
# true_list. Deriving the bounds keeps them aligned if any set changes size.
_graded_datasets = [
    dataset_test_tomi,
    dataset_test_hitom_third,
    dataset_test_hitom_fourth,
    dataset_test_exploretom,
    dataset_test_ToMbench,
    dataset_test_socialqa,
    dataset_test_simpletom_mental,
    dataset_test_simpletom_behavior,
    dataset_test_simpletom_judgment,
    dataset_test_tomato_first,
    dataset_test_tomato_second,
    dataset_test_opentom_attitude,
    dataset_test_opentom_location_cg_fo,
    dataset_test_opentom_location_cg_so,
]

# Inclusive (start, end) index pairs into true_list, one per evaluation set.
ranges = []
_next_start = 0
for _dataset in _graded_datasets:
    _size = len(_dataset)
    ranges.append((_next_start, _next_start + _size - 1))
    _next_start += _size

if _next_start != len(true_list):
    print(
        f"Warning: {len(true_list)} graded answers but the evaluation sets "
        f"hold {_next_start} examples; ratios may be incomplete"
    )

for start, end in ranges:
    segment = true_list[start:end + 1]
    total = len(segment)
    # An empty segment reports 0 instead of dividing by zero.
    ratio = sum(segment) / total if total > 0 else 0
    print(f"Range {start}-{end}: True ratio = {ratio:.4f}")
